Analysis of single-cell RNA-Seq data of Mouse spematogenesis (Accession no: E-MTAB-6946)

Author: Prabhakaran Munusamy

In [1]:
%config InlineBackend.figure_formats = ["retina"]
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import scanpy as sc
from scipy import stats
from  more_itertools import unique_everseen
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.n_jobs = 16
scanpy==1.4.7.dev82+ge1cd0d8.d20200805 anndata==0.7.1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.3 scikit-learn==0.22.1 statsmodels==0.11.0rc2 python-igraph==0.7.1 leidenalg==0.7.0
In [2]:
import anndata2ri
anndata2ri.activate()
In [3]:
%reload_ext rpy2.ipython
In [4]:
plt.rcParams.update({"font.family":"Reem Kufi"})
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
In [5]:
heatcolors = matplotlib.colors.LinearSegmentedColormap.from_list("", ["gray", "#000004FF", "#330A5FFF", "#781C6DFF","#BB3754FF",
                                                                      "#ED6925FF", "#FCB519FF", "#FCFFA4FF"])

heatcolors_wr = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "#FFF5F0", "#FEE0D2", "#FCBBA1", "#FC9272",
                                                                         "#FB6A4A", "#EF3B2C", "#CB181D", "#A50F15", "#67000D"])
In [26]:
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
In [7]:
%%R
suppressWarnings(suppressMessages(require(scran)))
suppressWarnings(suppressMessages(require(scater)))
suppressWarnings(suppressMessages(library(DropletUtils)))
suppressWarnings(suppressMessages(library(batchelor)))
options(mc.cores = 24L)
In [8]:
%%R
# Load the individual data
# P5
do26386 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do26386_count/outs/filtered_feature_bc_matrix/")
colData(do26386)$Sample <- rep("P5", ncol(do26386))
colData(do26386)$Library <- rep("do26386", ncol(do26386))
do26387 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do26387_count/outs/filtered_feature_bc_matrix/")
colData(do26387)$Sample <- rep("P5", ncol(do26387))
colData(do26387)$Library <- rep("do26387", ncol(do26387))
# P10
do17821 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17821_count/outs/filtered_feature_bc_matrix/")
colData(do17821)$Sample <- rep("P10", ncol(do17821))
colData(do17821)$Library <- rep("do17821", ncol(do17821))
# P15
do18195 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do18195_count/outs/filtered_feature_bc_matrix/")
colData(do18195)$Sample <- rep("P15", ncol(do18195))
colData(do18195)$Library <- rep("do18195", ncol(do18195))
# P20
do17824 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17824_count/outs/filtered_feature_bc_matrix/")
colData(do17824)$Sample <- rep("P20", ncol(do17824))
colData(do17824)$Library <- rep("do17824", ncol(do17824))
# P25
do18196 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do18196_count/outs/filtered_feature_bc_matrix/")
colData(do18196)$Sample <- rep("P25", ncol(do18196))
colData(do18196)$Library <- rep("do18196", ncol(do18196))
# P30
do17825 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17825_count/outs/filtered_feature_bc_matrix/")
colData(do17825)$Sample <- rep("P30", ncol(do17825))
colData(do17825)$Library <- rep("do17825", ncol(do17825))
# P35
do17827 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17827_count/outs/filtered_feature_bc_matrix/")
colData(do17827)$Sample <- rep("P35", ncol(do17827))
colData(do17827)$Library <- rep("do17827", ncol(do17827))
# B6
do17815 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17815_count/outs/filtered_feature_bc_matrix/")
colData(do17815)$Sample <- rep("B6", ncol(do17815))
colData(do17815)$Library <- rep("do17815", ncol(do17815))
do17816 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17816_count/outs/filtered_feature_bc_matrix/")
colData(do17816)$Sample <- rep("B6", ncol(do17816))
colData(do17816)$Library <- rep("do17816", ncol(do17816))
In [9]:
%%R -o ms_sce
# Merge all datasets
ms_sce <- cbind(do26386, do26387, do17821, do18195, do17824, do18196, do17825, do17827, do17815, do17816)
rownames(ms_sce) <- rowData(ms_sce)$Symbol
print(ms_sce)
class: SingleCellExperiment 
dim: 31053 30700 
metadata(10): Samples Samples ... Samples Samples
assays(1): counts
rownames(31053): Xkr4 Gm1992 ... Vmn2r122 CAAA01147332.1
rowData names(3): ID Symbol Type
colnames: NULL
colData names(3): Sample Barcode Library
reducedDimNames(0):
spikeNames(0):
altExpNames(0):
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [10]:
%%R
bcrank <- barcodeRanks(counts(ms_sce))

# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
    xlab="Rank", ylab="Total UMI count", cex.lab=1.2)

abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)

legend("bottomleft", legend=c("Knee"), 
        col=c("dodgerblue"), lty=2, cex=1.2)
In [11]:
%%R
MT.Genes <- grep("^mt-", rownames(ms_sce), value = T)
print(MT.Genes)
 [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
 [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
In [12]:
%%R
# Calculate QC metrics
ms_sce <- addPerCellQC(ms_sce, subsets = list(mt=MT.Genes))

# Remove low quality cells i.e., those with less than 200 genes expressed and high mt%
ms_sce <- ms_sce[,ms_sce$detected > 200 & ms_sce$subsets_mt_percent < 20] 

# Remove unxpressed genes 
ms_sce <- ms_sce[Matrix::rowSums(counts(ms_sce)) > 0, ]
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [13]:
%%R
print(ms_sce)
class: SingleCellExperiment 
dim: 26845 30668 
metadata(10): Samples Samples ... Samples Samples
assays(1): counts
rownames(26845): Xkr4 Gm1992 ... Vmn2r122 CAAA01147332.1
rowData names(3): ID Symbol Type
colnames: NULL
colData names(13): Sample Barcode ... subsets_mt_percent total
reducedDimNames(0):
spikeNames(0):
altExpNames(0):
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [14]:
%%R
# Normalization
set.seed(55063)
clusters <- quickCluster(ms_sce)
ms_sce <- computeSumFactors(ms_sce, cluster = clusters)
ms_sce <- logNormCounts(ms_sce)

set.seed(55063)
mod_ms <- modelGeneVar(ms_sce)
hvgs_ms <- getTopHVGs(mod_ms, prop = 0.1)

length(hvgs_ms)
[1] 1376
In [15]:
%%R
# Dimensionality reduction
set.seed(55063)
ms_sce <- runPCA(ms_sce, subset_row = hvgs_ms)
ms_sce <- runTSNE(ms_sce, dimred = "PCA", num_threads = 24, verbose = F)
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [16]:
%%R -o ms_sce
# Perform Batch correction
# Identify HVGs based on sample
set.seed(55063)
mod_ms_batch <- modelGeneVar(ms_sce, block = colData(ms_sce)$Sample)
hvgs_ms_batch <- getTopHVGs(mod_ms_batch, prop = 0.1)

set.seed(55063)
ms_sce_corrected <- fastMNN(ms_sce, subset.row = hvgs_ms_batch, batch=ms_sce$Sample)

set.seed(55063)
ms_sce_corrected <- runTSNE(ms_sce_corrected, dimred = "corrected", perplexity=300, num_threads = 12, verbose = F)
reducedDim(ms_sce, "mnn") <- reducedDim(ms_sce_corrected, "corrected")
reducedDim(ms_sce, "mnnTSNE") <- reducedDim(ms_sce_corrected, "TSNE")
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [17]:
%%R
length(hvgs_ms_batch)
[1] 1642
In [18]:
ms_sce.var_names_make_unique()
In [19]:
ms_sce.obs['Sample'].value_counts()
Out[19]:
P5     7999
P25    4400
P15    4300
B6     3399
P10    3271
P35    3199
P30    2300
P20    1800
Name: Sample, dtype: int64
In [20]:
ms_sce.obs['sum'].describe()
Out[20]:
count     30668.000000
mean      19504.557911
std       18543.629476
min        2640.000000
25%        6976.000000
50%       14286.500000
75%       25177.250000
max      267555.000000
Name: sum, dtype: float64
In [21]:
ms_sce.obs['detected'].describe()
Out[21]:
count    30668.000000
mean      3699.166069
std       1506.911002
min        426.000000
25%       2478.000000
50%       3635.500000
75%       4778.000000
max       9777.000000
Name: detected, dtype: float64
In [27]:
sc.pl.tsne(ms_sce, color = ["Sample"], 
           size = 10, 
           edgecolor = "black", 
           linewidths = 0.1,
           legend_fontsize = 8, title="Before batch correction")
In [28]:
ms_sce.obsm['X_tsne'] = ms_sce.obsm['mnnTSNE']
In [29]:
sc.pl.tsne(ms_sce, color = ["Sample"], 
           size = 10, 
           edgecolor = "black", 
           linewidths = 0.1, title="After batch correction",
           legend_fontsize = 8)
In [30]:
ms_sce.obsm["X_pca"] = ms_sce.obsm["mnn"]
In [31]:
sc.pp.neighbors(ms_sce, n_neighbors = 5)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:18)
In [32]:
sc.tl.leiden(ms_sce, resolution = 2, key_added = 'leiden_2')
running Leiden clustering
    finished: found 60 clusters and added
    'leiden_2', the cluster labels (adata.obs, categorical) (0:00:07)
In [33]:
sc.pl.tsne(ms_sce, color=["leiden_2"], 
           size=8, 
           legend_loc = "on data", 
           edgecolor = "black", 
           linewidths = 0.1,
           ncols = 3, 
           legend_fontsize = 8, 
           alpha=0.2)
In [34]:
wv = sc.pl.stacked_violin(ms_sce, groupby="leiden_2", var_names=["Ddx4", "Utf1", "Uchl1", "Id4", "Rec8", "Stra8", "Aurka", "Acrv1", 
                                                            "Prm2", "Vim", "Amh", "Cd74", "Vwf", "Acta2", "Tagln", "Clu", "Sox9", 
                                                                 "Insl3", "Inhba", "Dcn", "Sparc"],
                    swap_axes=True, layer="logcounts")
In [35]:
ms_leiden_clusters = ms_sce.obs['leiden_2']
In [36]:
%%R -i ms_leiden_clusters
ms_sce$ms_leiden_clusters = ms_leiden_clusters
ms_sce$broadcluster <- as.character(ms_sce$ms_leiden_clusters)
In [37]:
%%R
ms_sce$broadcluster[ms_sce$broadcluster %in% c("7", "0", "16", "27", "2", "54", "9", "21", "41", "38", "12", "49", "43", "46", "52", "1", "48", "53")] = "Somatic"

ms_sce$broadcluster[ms_sce$broadcluster %in% c("17", "51", "42", "47", "40", "55", "56", "59", "57", "58", "45", "50")] = "Outliers"

ms_sce$broadcluster[!ms_sce$broadcluster %in% c("Outliers", "Somatic")] = "Germ"

Mouse germ and somatic cells analysis

In [38]:
%%R 
# Seperate Germ and somatic cells
ms_sce_gs <- ms_sce[,ms_sce$broadcluster %in% c("Germ", "Somatic")]
ms_sce_gs <- ms_sce_gs[Matrix::rowSums(counts(ms_sce_gs)) > 0, ]
assay(ms_sce_gs, "normcounts") <- as(((2^logcounts(ms_sce_gs))-1), "dgCMatrix")
ms_sce_gs$ms_leiden_clusters <- as.character(ms_sce_gs$ms_leiden_clusters)
In [39]:
%%R -o ms_sce_gs
print(ms_sce_gs)
class: SingleCellExperiment 
dim: 26728 28529 
metadata(10): Samples Samples ... Samples Samples
assays(3): counts logcounts normcounts
rownames(26728): Xkr4 Gm1992 ... Vmn2r122 CAAA01147332.1
rowData names(3): ID Symbol Type
colnames: NULL
colData names(15): Sample Barcode ... ms_leiden_clusters broadcluster
reducedDimNames(4): PCA TSNE mnn mnnTSNE
spikeNames(0):
altExpNames(0):
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [40]:
ms_sce_gs.obs.index = ms_sce_gs.obs['Barcode'].astype('str')+'-'+ms_sce_gs.obs['Sample'].astype('str')
In [41]:
ms_sce_gs.var_names_make_unique()
In [42]:
ms_sce_gs.obsm['X_tsne']=ms_sce_gs.obsm['mnnTSNE']
In [43]:
gsconditions = [
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["1", "52"])),#Myoid
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["0", "2", "7", "9", "16", "21", "27"])),#Immature Leydig
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["41", "38", "12", "49", "43"])),#Leydig
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["54"])),#Sertoli
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["48", "53"])),#Macrophage
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["46"])),#Endothelial
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["44", "5", "18"])),#Spermatogonia
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["39", "33", "6", "8", "4", "3", "14", "10", "30", "26", "15", "11", "13"])),#Spermatocytes
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["19", "37", "20", "24", "31", "22", "29", "36"])),#Round spermatids
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["34", "25", "23", "28","32"])),#Elongating Spermatids
    (ms_sce_gs.obs["ms_leiden_clusters"].isin(["35"]))]#Sperm
gsgroups = ["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial", "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"]
In [44]:
gs_clusters = np.select(gsconditions, gsgroups, default="Germ")
In [45]:
ms_sce_gs.obs["broad_clusters"] = gs_clusters
In [46]:
ms_sce_gs.obs["broad_clusters"] = ms_sce_gs.obs["broad_clusters"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial", 
                                                                                                             "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"],ordered=True)
In [47]:
ms_sce_gs.obs['ms_leiden_clusters_celltype'] = ms_sce_gs.obs['ms_leiden_clusters'].astype(str) +"-"+ms_sce_gs.obs['broad_clusters'].astype(str)
In [48]:
cols_clusters = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red", "#F16913",  "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]
In [49]:
sc.pl.tsne(ms_sce_gs, color=["broad_clusters"], legend_fontsize=10, palette=cols_clusters, 
           size=20, edgecolor="black", linewidths=0.1, title="")
... storing 'Sample' as categorical
... storing 'Barcode' as categorical
... storing 'Library' as categorical
... storing 'ms_leiden_clusters' as categorical
... storing 'broadcluster' as categorical
... storing 'ms_leiden_clusters_celltype' as categorical
... storing 'Symbol' as categorical
... storing 'Type' as categorical
In [50]:
sc.tl.dendrogram(ms_sce_gs, groupby="ms_leiden_clusters_celltype")
    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_ms_leiden_clusters_celltype']`
In [51]:
sc.pl.correlation_matrix(ms_sce_gs, groupby="ms_leiden_clusters_celltype", figsize=(10,6))
In [52]:
sc.tl.rank_genes_groups(ms_sce_gs, "broad_clusters", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:02:14)
In [53]:
gslist = {}
for i in ms_sce_gs.obs["broad_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(ms_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25, 
                                        key="rank_genes_groups").sort_values(by=["logfoldchanges"], ascending=[False]).dropna(axis=0)["names"].to_list()
    gslist[i] = genes
In [54]:
for key, value in gslist.items():
    #print value
    print(key, len([item for item in value if item]))
Myoid 2679
Immature Leydig 2423
Leydig 2756
Sertoli 605
Macrophage 1359
Endothelial 2420
Spermatogonia 2719
Spermatocytes 3546
Round spermatids 4021
Elongating spermatids 4643
Sperm 2133
In [55]:
gs_L1 = []
for i in ms_sce_gs.obs["broad_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(ms_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    gs_L1.extend(genes)
In [56]:
gs_unique_genes = list(unique_everseen(gs_L1))
len(gs_unique_genes)
Out[56]:
15154
In [57]:
gs_mat = sc.pl.matrixplot(ms_sce_gs, gs_unique_genes, groupby="broad_clusters", figsize=(4, 4),standard_scale="var",
                          linewidth=.000001,swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts",
                          show=False, show_gene_labels=False)

Analysis of mouse germ cells

In [58]:
%%R -o ms_sce_germ
## Germ
ms_sce_germ <- ms_sce[,ms_sce$broadcluster=="Germ"]
ms_sce_germ <- ms_sce_germ[Matrix::rowSums(counts(ms_sce_germ)) > 0, ]
ms_sce_germ$ms_leiden_clusters <- factor(ms_sce_germ$ms_leiden_clusters)

# Identify HVGs based on sample
set.seed(55063)
mod_ms_batch_germ <- modelGeneVar(ms_sce_germ, block=colData(ms_sce_germ)$Sample)
hvgs_ms_batch_germ <- getTopHVGs(mod_ms_batch_germ, prop=0.1)
print(length(hvgs_ms_batch_germ))

set.seed(55063)
ms_sce_germ_corrected <- fastMNN(ms_sce_germ, subset.row=hvgs_ms_batch_germ, batch=ms_sce_germ$Sample)
reducedDim(ms_sce_germ, "mnn") <- reducedDim(ms_sce_germ_corrected, "corrected")
ms_sce_germ$ms_leiden_clusters <- as.character(ms_sce_germ$ms_leiden_clusters)
[1] 1607
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [59]:
ms_sce_germ.obs.index = ms_sce_germ.obs['Barcode'].astype('str')+'-'+ms_sce_germ.obs['Sample'].astype('str')
In [60]:
ms_sce_germ.var_names_make_unique()
In [61]:
ms_sce_germ.obs['sum'].describe()
Out[61]:
count     18288.000000
mean      26263.559383
std       19671.580012
min        3263.000000
25%       13460.750000
50%       20608.500000
75%       32874.250000
max      267555.000000
Name: sum, dtype: float64
In [62]:
ms_sce_germ.obs['detected'].describe()
Out[62]:
count    18288.000000
mean      4367.063867
std       1245.019657
min       1337.000000
25%       3366.000000
50%       4336.000000
75%       5229.000000
max       9777.000000
Name: detected, dtype: float64
In [63]:
ms_sce_germ.obsm['X_tsne'] = ms_sce_germ.obsm['mnnTSNE']
In [64]:
markers = [g.capitalize() for g in ["UTF1", "ID4","FGFR3", "SOHLH1", "UCHL1","DMRT1", "KIT", "STRA8", "REC8",
                              "PRDM9","SPO11","DMC1","MEIOB", "BRCA2","ATR","SYCP1","SMC1B", "SMC3","PIWIL1", "PIWIL2", "RAD51", 
                              "SYCP2","SYCP3","HORMAD1","HORMAD2", "AURKA", "PLK1",
                              "NME5","SPACA1", "ACRV1", "AKAP14","TNP2","PRM2", "CRISP2", "TPPP2", "OAZ3"]]
In [65]:
sc.pl.tsne(ms_sce_germ, color=markers, legend_fontsize=8, color_map=heatcolors, 
           size=20, edgecolor="black", linewidths=0.1,wspace=0.1, hspace=0.2, ncols=4, layer="logcounts")
... storing 'Sample' as categorical
... storing 'Barcode' as categorical
... storing 'Library' as categorical
... storing 'ms_leiden_clusters' as categorical
... storing 'broadcluster' as categorical
... storing 'Symbol' as categorical
... storing 'Type' as categorical
In [66]:
ms_sce_germ.obsm["X_pca"] = ms_sce_germ.obsm["mnn"]
In [67]:
ms_sce_germ.obs["ms_leiden_clusters"].cat.reorder_categories(['44', '5', '18', '39', '33', '6', '8', '4', '3',
                                                              '14', '10', '30', '26', '15', '11', '13', '19',
                                                              '37', '20', '24', '31', '22', '29', '36', '34', 
                                                              '25', '23', '28','32', '35'], inplace = True)
In [68]:
germADconditions = [
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["44"])),#SPG1
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["5"])),#SPG2
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["18"])),#Diff SPG / pre-Lep
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["39"])),#Lep
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["33"])),#Zyg1
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["6"])),#Zyg2
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["8"])),#Zyg3
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["4"])),#Zyg4
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["3"])),#P1
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["14"])),#P2
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["10"])),#P3
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["30"])),#P4
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["26"])),#P5
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["15"])),#P/D
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["11"])),#D
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["13"])),#Sec. Spermatocytes
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["19"])),#RS1
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["37"])),#RS2
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["20"])),#RS3 
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["24"])),#RS4
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["31"])),#RS5
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["22"])),#RS6
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["29"])),#RS7
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["36"])),#RS8
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["34"])),#ES1
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["25"])),#ES2
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["23"])),#ES3
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["28"])),#ES4
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["32"])),#ES5
    (ms_sce_germ.obs["ms_leiden_clusters"].isin(["35"]))]#Sperm
germADgroups = ["Undiff1","Undiff2","Diff / pre-Lep", "Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4", "Pach1", "Pach2",
                "Pach3", "Pach4", "Pach5", "Pach6", "Dip", "Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8", "ES1","ES2","ES3","ES4","ES5","Sperm"]
In [69]:
germAD_clusters = np.select(germADconditions, germADgroups, default="Germ")
In [70]:
ms_sce_germ.obs["germAD_clusters"] = germAD_clusters
In [71]:
ms_sce_germ.obs["germAD_clusters"] = ms_sce_germ.obs["germAD_clusters"].astype("category").cat.reorder_categories(["Undiff1","Undiff2","Diff / pre-Lep", 
                                                                                                                   "Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4", 
                                                                                                                   "Pach1", "Pach2", "Pach3", "Pach4", "Pach5", "Pach6", 
                                                                                                                   "Dip", "Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8",
                                                                                                                   "ES1","ES2","ES3","ES4","ES5","Sperm"],ordered=True)
In [72]:
cols_clusters = ["#FEE6CE", "#FDAE6B", "#E6550D",
                 "#F7FCF5", "#E2ECE2", "#CDDDD0", "#B9CEBE", "#A4BEAC", "#90AF9A", "#7BA088", "#669075", "#528163", "#3D7151", "#29623F", "#14532D", "#00441B",
                 "#F7FBFF", "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
                 "#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
In [73]:
sc.pl.tsne(ms_sce_germ, color=["germAD_clusters"], size=10, palette=cols_clusters,
            edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
In [74]:
ms_sce_germ.obs['Annotated'] = ms_sce_germ.obs['germAD_clusters']
In [75]:
ms_sce_gs.obs['Annotated'] = ms_sce_gs.obs['broad_clusters']
ms_sce_gs.obs['Annotated'] = ms_sce_gs.obs['Annotated'].astype('str')

Visualize the annotated germ and somatic celltypes integrated on tSNE

In [76]:
for idx, x in ms_sce_germ.obs['germAD_clusters'].iteritems():
    ms_sce_gs.obs.at[idx, 'Annotated'] = x 
In [77]:
ms_sce_gs.obs["Annotated"] = ms_sce_gs.obs["Annotated"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial",
                                                                                                     "Undiff1","Undiff2","Diff / pre-Lep", "Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4",
                                                                                                   "Pach1", "Pach2", "Pach3", "Pach4", "Pach5", "Pach6", "Dip", "Sec. Spcs",
                                                                                                   "RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8",
                                                                                                   "ES1","ES2","ES3","ES4","ES5","Sperm"],ordered=True)
In [78]:
palette = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red",
           "#FEE6CE", "#FDAE6B", "#E6550D",
                 "#F7FCF5", "#E2ECE2", "#CDDDD0", "#B9CEBE", "#A4BEAC", "#90AF9A", "#7BA088", "#669075", "#528163", "#3D7151", "#29623F", "#14532D", "#00441B",
                 "#F7FBFF", "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
                 "#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
In [79]:
sc.pl.tsne(ms_sce_gs, color=["Annotated"], size=10, palette=palette,
            edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
In [80]:
sc.pp.neighbors(ms_sce_germ, n_neighbors=5) # PAGA - Partition-based graph abstraction
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
In [81]:
sc.tl.paga(ms_sce_germ, groups="germAD_clusters") # PAGA - Partition-based graph abstraction
running PAGA
    finished: added
    'paga/connectivities', connectivities adjacency (adata.uns)
    'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:01)
In [82]:
sc.pl.paga(ms_sce_germ, color="germAD_clusters", threshold=0.2, fontsize=8, fontoutline=1, frameon=False)
--> added 'pos', the PAGA positions (adata.uns['paga'])
In [83]:
 ms_sce_germ.obs["Annotated"] = ms_sce_germ.obs["germAD_clusters"]
In [84]:
sc.tl.rank_genes_groups(ms_sce_germ, "germAD_clusters", n_genes=6000, method="wilcoxon", layer="logcounts", use_raw=False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:02:10)
In [85]:
germlist = {}
for i in ms_sce_germ.obs["germAD_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(ms_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, 
                                        key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    germlist[i] = genes
In [86]:
germ_L1 = []
for i in ms_sce_germ.obs["germAD_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(ms_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    germ_L1.extend(genes) 
In [87]:
germ_unique_genes = list(unique_everseen(germ_L1))
len(germ_unique_genes)
Out[87]:
13636
In [88]:
ax = sc.pl.matrixplot(ms_sce_germ, germ_L1, groupby="germAD_clusters", figsize=(10,4),standard_scale="var", linewidth=.000001, swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
In [89]:
ms_sce_germ.X = ms_sce_germ.layers["logcounts"]
In [90]:
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Mouse/Mouse_genes_annotation_gtf.csv")
allgenes.index = allgenes['gene_name'].tolist()
allgenes = allgenes[['gene_id', 'seqname']]
allgenes.columns = ['GeneStableID', 'Chromosome']
In [91]:
allgenes = allgenes[allgenes['Chromosome'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])]
In [92]:
for ch in allgenes['Chromosome'].astype("category").cat.categories.tolist():
    genes = allgenes["Chromosome"]==ch
    Agenes = set(allgenes[genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
    ms_sce_germ.obs[ch] = np.ravel((ms_sce_germ[:, list(Agenes)].X != 0).sum(axis=1))
In [93]:
AllChr_df = pd.DataFrame()
for column in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y']:
    x = ms_sce_germ.obs[column]
    AllChr_df[column] = x
    
AllChr_df['germAD_clusters'] = ms_sce_germ.obs['germAD_clusters']
AllChr_df_melted = AllChr_df.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
In [94]:
AllChr_df_melted_mean = AllChr_df_melted.groupby(["germAD_clusters", "key"], as_index=False).mean()
# Reshape the data
AllChr_df_mean_reshaped = AllChr_df_melted_mean.pivot(index="key",columns="germAD_clusters")
In [95]:
AllChr_df_melted_std = AllChr_df_melted.groupby(["germAD_clusters", "key"]).std().reset_index()
AllChr_df_melted_std_reshaped = AllChr_df_melted_std.pivot(index="key",columns="germAD_clusters")
In [96]:
plt.rcParams["axes.linewidth"] = 0.2
plt.rcParams["figure.figsize"]=6,3
cols = ["#87CEEB", "#83C8EA", "#7FC3E9", "#7BBEE9", "#78B8E8", "#74B3E8", 
"#70AEE7", "#6DA8E7", "#69A3E6", "#659EE6", "#6298E5", "#5E93E5", 
"#5A8EE4", "#5788E4", "#5383E3", "#4F7EE3", "#4C78E2", "#4873E2", 
"#446EE1", "darkorange"]

i=0
for c in ['1', '2','3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'Y']:#range(23):
    plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(), 
             AllChr_df_mean_reshaped.loc[c], linewidth=0.75, color=cols[i], label = c,
             alpha=1)
    i+=1

    
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(), 
             AllChr_df_mean_reshaped.loc['X'], linewidth=1.5, marker='', color= "#C70039", label="X")

# Add legend
plt.legend(loc="upper right", ncol=2, fontsize=6, frameon=False, bbox_to_anchor=(1.15, 1))
# Add titles
plt.ylabel("Mean no. of genes", fontsize=8)
plt.xlabel("")
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor")
plt.tick_params(
    axis="both",          # changes apply to the x-axis
    which="both",      # both major and minor ticks are affected
    bottom=True,      # ticks along the bottom edge are off
    top=False,  left=True,       # ticks along the top edge are off
    labelbottom=True, pad=0.5, length=1.5, labelsize=8)
plt.yscale('log', basey=10)
plt.tight_layout()
plt.margins(0.003,0.003)
plt.grid(linestyle='-', linewidth='0.1')
In [97]:
# Retrive chromosomes X, Y, 9 and autosomal genes for X to A ratio
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
chr9_genes = allgenes["Chromosome"]=="9"
chrA_genes = allgenes[~allgenes.Chromosome.isin(["X","Y", "MT"])]
In [98]:
# Retrieve the X and Y genes specifically present in the germ cells alone
germ_Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_9genes = set(allgenes[chr9_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_Agenes = set(chrA_genes.index.tolist()).intersection(ms_sce_germ.var_names.tolist())
In [99]:
ms_sce_germ.obs["Chr X"] = np.mean(
    ms_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1

ms_sce_germ.obs["Chr Y"] = np.mean(
    ms_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1

ms_sce_germ.obs["Chr 9"] = np.mean(
    ms_sce_germ[:, list(germ_9genes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
In [100]:
germXA_data = ms_sce_germ.obs["germAD_clusters"].to_frame().join(ms_sce_germ.obs["Chr 9"]).join(ms_sce_germ.obs["Chr X"])#.join(ms_sce_germ.obs["Chr Y"])
# Melt the data to long shape
germXA_data_melted = germXA_data.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
In [101]:
from scipy import stats
stats.mannwhitneyu(germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Diff / '])) & (germXA_data_melted['key'].isin(['Chr X']))].value,
              germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Zyg1'])) & (germXA_data_melted['key'].isin(['Chr X']))].value)
Out[101]:
MannwhitneyuResult(statistic=0.0, pvalue=0.0)
In [102]:
from matplotlib.ticker import FormatStrFormatter

plt.rcParams["figure.figsize"]=6,3
bx = sns.violinplot(x="germAD_clusters", y="value",
            hue="key", palette=["#0D8DB3","#C70039"],fliersize=0.1,linewidth=0.5,split=True,scale="count",inner="quartile",
            data=germXA_data_melted)
bx.set(xlabel="")
bx.set_ylabel("Chromosome:Autosome ratio", fontsize=8)
bx.set_xticklabels(bx.get_xticklabels(), rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor", visible=True)
bx.grid(linestyle='-', linewidth='0.1')
bx.tick_params(
    axis="both",          # changes apply to the x-axis
    which="major",      # both major and minor ticks are affected
    bottom=True,      # ticks along the bottom edge are off
    top=False,  left=True,       # ticks along the top edge are off
    labelbottom=True,labeltop=False, pad=0.5, length=1.5)
bx.axhline(y=0.5,color='gray',linestyle='--')
bx.axhline(y=1,color='gray',linestyle='--')
plt.legend(loc="upper right", fontsize=6, frameon=False)
plt.tight_layout()
In [103]:
ms_sce_germ.obs["ChrX percent"] = (np.sum(
    ms_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.sum(ms_sce_germ.X, axis=1).A1)*100

ms_sce_germ.obs["ChrY percent"] = (np.sum(
    ms_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.sum(ms_sce_germ.X, axis=1).A1)*100
In [104]:
# Make a dataframe using selected columns
germ_dist_data = ms_sce_germ.obs['germAD_clusters'].to_frame().join(ms_sce_germ.obs['detected']).join(ms_sce_germ.obs['sum']).join(ms_sce_germ.obs['subsets_mt_percent']).join(ms_sce_germ.obs['ChrX percent']).join(ms_sce_germ.obs['ChrY percent'])
germ_dist_data.columns = ['germAD_clusters','No. of genes', 'UMI Count', '% Mito', '% ChrX', '% ChrY']
# Melt the data to long shape
germ_dist_data_melted = germ_dist_data.melt(id_vars='germAD_clusters', var_name='key', value_name='value')
In [105]:
g = sns.FacetGrid(germ_dist_data_melted, col="key", height=6, sharex=False, hue="germAD_clusters", aspect=.4, palette=cols_clusters) 
g.map(sns.violinplot, "value", "germAD_clusters", label='xxlarge', linewidth=0.01).set_titles("{col_name}").set_axis_labels("","")
Out[105]:
<seaborn.axisgrid.FacetGrid at 0x2aafc013faf0>

Pseudotime analysis

In [106]:
sc.pp.neighbors(ms_sce_germ, n_pcs=50, knn=False, method="gauss")# Find the neighbors
sc.tl.diffmap(ms_sce_germ)
computing neighbors
WARNING: Using high n_obs without `knn=True` takes a lot of memory...
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:49)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:01:26)
    eigenvalues of transition matrix
    [1.         0.9996877  0.9987416  0.9973536  0.99525005 0.9931277
     0.98995364 0.986554   0.982538   0.9767902  0.97121197 0.96602124
     0.95890343 0.95173347 0.94572175]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:02:00)
In [107]:
ms_sce_germ.uns["iroot"] = np.flatnonzero(ms_sce_germ.obs["germAD_clusters"]  == "Undiff1")[0] # Set the root cell
sc.tl.dpt(ms_sce_germ, n_dcs=2)
computing Diffusion Pseudotime using n_dcs=2
    finished: added
    'dpt_pseudotime', the pseudotime (adata.obs) (0:00:00)
In [111]:
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":12,"axes.titlesize":12})
In [112]:
sc.pl.tsne(ms_sce_germ, color=["dpt_pseudotime"], legend_fontsize=8, cmap= "viridis",
           size=10, edgecolor="black", linewidths=0.1) #Use gauss method to find neighbors to get proper pseudotime 

Analysis of mouse sex chromosome genes in germ cells

In [113]:
# Load chr X gene annnotation file
Xgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Mouse/Mouse_Xgenes_Annotated_updated.csv")
In [114]:
Xgenes.index = Xgenes['gene_name'].tolist()
In [115]:
Xgenes['GeneClass'].value_counts()
Out[115]:
Single-copy    777
Multicopy      203
Ampliconic     108
Name: GeneClass, dtype: int64
In [116]:
Xgenes[['Ancestral', 'Shared']] = Xgenes[['Ancestral','Shared']].fillna(value="NotShared")
In [117]:
germXsingle_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Single-copy"])].tolist()]
germXsingle_sce.obs_names_make_unique()
germXMulti_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Multicopy"])].tolist()]
germXMulti_sce.obs_names_make_unique()
germXAmpliconic_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Ampliconic"])].tolist()]
germXAmpliconic_sce.obs_names_make_unique()
germY_sce = ms_sce_germ[:,list(germ_Ygenes)]
germY_sce.obs_names_make_unique()

sc.tl.rank_genes_groups(germXsingle_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXMulti_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXAmpliconic_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germY_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);


singleX_L1 = []
for i in germXsingle_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXsingle_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    singleX_L1.extend(genes)
    
MultiX_L1 = []
for i in germXMulti_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXMulti_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    MultiX_L1.extend(genes)
    
Ampli_L1 = []
for i in germXAmpliconic_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXAmpliconic_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    Ampli_L1.extend(genes)
    
singleY_L1 = []
for i in germY_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germY_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    singleY_L1.extend(genes)

singleX_unique = pd.Series(singleX_L1).drop_duplicates().tolist()
MultiX_unique = pd.Series(MultiX_L1).drop_duplicates().tolist()
Ampli_unique = pd.Series(Ampli_L1).drop_duplicates().tolist()
singleY_unique = pd.Series(singleY_L1).drop_duplicates().tolist()

print("No. of single X copy genes: %d" %len(singleX_unique))
print("No. of multi X copy genes: %d" %len(MultiX_unique))
print("No. of ampliconic X genes: %d" %len(Ampli_unique))
print("No. of Y genes: %d" %len(singleY_unique))
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:06)
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)
No. of single X copy genes: 692
No. of multi X copy genes: 145
No. of ampliconic X genes: 95
No. of Y genes: 184
In [118]:
XYgenes = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique, 'Y':singleY_unique}
In [119]:
X_sma = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique}

X chromsome genes (Single, multi-copy and ampliconic) gene expression

In [120]:
ax = sc.pl.matrixplot(ms_sce_germ, X_sma, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.01,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)

Multi-copy and Ampliconic gene expression

In [121]:
ax = sc.pl.matrixplot(ms_sce_germ, {'Multi':MultiX_unique, 'Ampli':Ampli_unique}, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
In [122]:
Xgenes_ms = Xgenes[Xgenes['gene_name'].isin(ms_sce_germ.var['Symbol'])]
In [123]:
Xgenes_ms['GeneClass'].value_counts()
Out[123]:
Single-copy    692
Multicopy      145
Ampliconic      95
Name: GeneClass, dtype: int64
In [124]:
pd.crosstab(Xgenes_ms['GeneClass'], Xgenes_ms['Shared'])
Out[124]:
Shared NotShared Shared_hs_mm Shared_hs_mm_mf
GeneClass
Ampliconic 42 20 33
Multicopy 56 39 50
Single-copy 164 46 482
In [125]:
MultiX_unique_shared = {}
for i in Xgenes_ms['Shared'].astype('category').cat.categories:
    genes = [x for x in MultiX_unique if x in Xgenes_ms['gene_name'][(Xgenes_ms['GeneClass']=='Multicopy') & (Xgenes_ms['Shared']==i)].tolist()]
    MultiX_unique_shared[i] = genes

Multi-copy gene expression along with its conservation type

In [126]:
ax = sc.pl.matrixplot(ms_sce_germ, MultiX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.001,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
In [127]:
AmpliX_unique_shared = {}
for i in Xgenes_ms['Shared'].astype('category').cat.categories:
    genes = [x for x in Ampli_unique if x in Xgenes_ms['gene_name'][(Xgenes_ms['GeneClass']=='Ampliconic') & (Xgenes_ms['Shared']==i)].tolist()]
    AmpliX_unique_shared[i] = genes

Ampliconic gene expression along with its conservation type

In [128]:
ax = sc.pl.matrixplot(ms_sce_germ, AmpliX_unique_shared, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.001,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)

Y chromosome gene expression as dotplot

In [129]:
sc.pl.dotplot(ms_sce_germ,singleY_unique, groupby="Annotated", figsize=(18,6), standard_scale="var",color_map=heatcolors_wr,
                 dendrogram=False, layer="logcounts")# Y genes
Out[129]:
GridSpec(2, 5, height_ratios=[0, 10.5], width_ratios=[17.05, 0, 0.2, 0.5, 0.25])

Analysis of ortholog and species-specific gene expression in germ cells

In [130]:
ms_unique = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Mm_unique.txt")
In [131]:
Orthologs = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/Human_Mouse_Monkey_orthologs_21Feb_clean.txt", sep=" ")

Orthologs = Orthologs[(Orthologs.Crab_eating_macaque_homology_type=="ortholog_one2one") & (Orthologs.Mouse_homology_type=="ortholog_one2one")]
In [132]:
mm_ortho_df = pd.DataFrame(columns=["1:1:1 Othologs", "Mouse_specific", "Other orthologs"])
In [ ]:
for i in ms_sce_germ.obs['germAD_clusters'].astype('category').cat.categories.tolist():
    df = i+"_df"
    df = ms_sce_germ[ms_sce_germ.obs['germAD_clusters'].isin([i]),:];
    #df = df[:,list(germ_Xgenes)]
    sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
    mm_ortho_df.loc[i] = [len(df.var['ID'][df.var['ID'].isin(Orthologs.Mouse_gene_stable_ID.tolist())])/len(df.var['ID'])*100,
                          len(df.var['ID'][df.var['ID'].isin(ms_unique['x'].tolist())])/len(df.var['ID'])*100,
                         len(df.var['ID'][~df.var['ID'].isin(Orthologs.Mouse_gene_stable_ID.tolist()+ms_unique['x'].tolist())])/len(df.var['ID'])*100]
In [134]:
mm_ortho_df['Groups'] = mm_ortho_df.index
In [135]:
plt.rcParams["figure.figsize"]=3,10
mm_ortho_df_melted = mm_ortho_df.melt('Groups', var_name='Type',  value_name='value')
g = sns.catplot(x="Groups", y="value", hue='Type', data=mm_ortho_df_melted, kind="point",
                palette = ["#DF8F44FF" , "#D9C7A5", "#80796BFF"],s=0.1, height=5, aspect=1.3, scale = 0.6)
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor");
plt.xlabel("")
plt.ylabel("% Genes expressed", fontsize=10);